from bs4 import BeautifulSoup
from ..items import ItblogItem
import scrapy



class CnblogSpider(scrapy.Spider):
    """Spider that extracts the post entries listed on the cnblogs.com home page.

    Each post entry found under ``#post_list`` is turned into one
    ``ItblogItem`` carrying the keys ``detail_url``, ``title``,
    ``description``, ``author`` and ``publish_time``.
    """

    name = "cnblog"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["https://www.cnblogs.com"]

    @staticmethod
    def _text_of(parent, selector, default=""):
        """Return the text of the first node under *parent* matching *selector*.

        ``select_one`` returns ``None`` when nothing matches; in that case
        *default* is returned instead of letting ``.text`` raise
        ``AttributeError``.
        """
        node = parent.select_one(selector)
        return default if node is None else node.text

    def parse(self, response):
        """Yield one ``ItblogItem`` per post entry found in *response*.

        Entries without a title link (or whose link has no ``href``) are
        skipped with a warning, because without it there is neither a title
        nor a detail URL to record. Any other missing field is stored as an
        empty string so that one malformed entry does not abort the parsing
        of the remaining entries on the page.
        """
        bs = BeautifulSoup(response.text, "lxml")
        sections = bs.select("#post_list article.post-item section.post-item-body")
        for section in sections:
            # Title link: carries both the post title and the detail-page URL.
            link = section.select_one("div.post-item-text a.post-item-title")
            href = link.get("href") if link is not None else None
            if href is None:
                self.logger.warning(
                    "Skipping post entry without a title link on %s", response.url
                )
                continue

            item = ItblogItem()
            item["detail_url"] = href
            item["title"] = link.text

            # Summary. NOTE: str.strip() treats its argument as a set of
            # characters, so this trims leading/trailing newlines, tabs,
            # spaces and dots (e.g. a trailing "..." ellipsis).
            description = self._text_of(
                section, "div.post-item-text p.post-item-summary"
            )
            item["description"] = description.strip("\n\t ...")

            # Post author.
            item["author"] = self._text_of(
                section, "footer.post-item-foot a.post-item-author span"
            )
            # First nested span of the first meta item in the footer;
            # stored under the "publish_time" key.
            item["publish_time"] = self._text_of(
                section, "footer.post-item-foot span.post-meta-item span"
            )

            yield item
